Load data
Host label to host groups
count.data <- read_csv('data/resistome_uc90_v2.csv')
## New names:
## * `` -> ...1
## Rows: 2652971 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): run_accession, refSequence
## dbl (3): ...1, clust_number, fragmentCountAln
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# query for meta data
# select * from Meta_public;
meta.data <- read_tsv('data/metadata.tsv', na = c("NULL", "")) %>%
mutate(
host = case_when(
is.na(host) ~ 'Mising',
TRUE ~ host
)
) %>%
left_join(enframe(host2group), by=c("host" = "name")) %>%
rename("hostGroup" = "value")
## New names:
## * `` -> ...1
## Rows: 214022 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): run_accession, host, instrument_platform, country, continent
## dbl (7): ...1, year, latitude, longitude, raw_reads, raw_bases, trimmed_frag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all.data <- meta.data %>%
inner_join(count.data, by=c("run_accession")) %>%
select(c(run_accession, host, hostGroup, year, latitude, longitude, country, instrument_platform, raw_reads, refSequence, fragmentCountAln, trimmed_fragments))
host.gene.counts <- all.data %>%
group_by(hostGroup, refSequence) %>%
summarise(fragmentCountAln=sum(fragmentCountAln))
## `summarise()` has grouped output by 'hostGroup'. You can override using the `.groups` argument.
all.gene.counts <- all.data %>%
group_by(refSequence) %>%
summarise(fragmentCountAln=sum(fragmentCountAln)) %>%
mutate(hostGroup = 'all') %>%
select(hostGroup, refSequence, fragmentCountAln)
trimmedFrag.hosts <- rbind(
all.data %>%
distinct(hostGroup, run_accession, host, trimmed_fragments) %>%
group_by(hostGroup) %>%
summarise(trimmed_fragments=sum(trimmed_fragments)),
all.data %>%
distinct(hostGroup, run_accession, host, trimmed_fragments) %>%
summarise(trimmed_fragments=sum(trimmed_fragments)) %>%
mutate(hostGroup='all')) %>%
filter(!is.na(hostGroup))
gene.counts <- rbind(all.gene.counts, host.gene.counts) %>%
left_join(trimmedFrag.hosts, by=c("hostGroup")) %>%
mutate(FPKM = fragmentCountAln / (trimmed_fragments / 1e6))
Make network files
resClasses <- get_resClasses()
## Loading required package: RMariaDB
makeFiles <- T
if (makeFiles) {
corrFiles <- list.files(path='data/res_minF50_minS10', pattern=glob2rx("*_corr.npy"), full.names = T)
pvalFiles <- list.files(path='data/res_minF50_minS10', pattern=glob2rx("*_pval.npy"), full.names = T)
colFiles <- list.files(path='data/res_minF50_minS10', pattern=glob2rx("*_columns.list"), full.names = T)
assertthat::assert_that(all(length(corrFiles) > 0,c(length(corrFiles) == length(pvalFiles), length(corrFiles) == length(colFiles), length(pvalFiles) == length(colFiles))))
alpha = 0.01
min_corr = 0.6
all.matrices <- list()
for (i in 1:length(corrFiles)) {
mat <- dataLoader(
corrFile=corrFiles[i],
pvalFile=pvalFiles[i],
colFile=colFiles[i]
)
host <- get_host(str_replace(corrFiles[i], '_sparr_corr.npy', ''))
all.matrices <- c(all.matrices, setNames(list(mat), host))
prefix = str_replace(basename(corrFiles[i]), '_sparr_corr.npy', '')
graphFile = file.path('output/networks_sparcc_frag50_minn10', paste0(prefix, '_', min_corr))
print(paste("Writing network json file to", graphFile))
graph <- make_net(mat, resClasses, alpha, min_corr, graphFile)
}
}
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_air_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_all_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_canis_lupus_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_chicken_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_cow_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_freshwater_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_homo_sapiens_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_marine_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_mouse_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_pig_0.6"
## [1] "Writing network json file to output/networks_sparcc_frag50_minn10/sparcc_soil_0.6"
# Grab names for all the different network files and load them
netFiles <- list.files(path='/Users/hanmar/repos/global-resistome2/output/networks_sparcc_frag50_minn10', pattern = glob2rx('sparcc_*_0.6.json'), full.names = T)
networks <- list()
for (netFile in netFiles) {
host <- get_host(netFile)
if (!host %in% hosts) {
next
}
# load graph file
G <- importGraph(netFile) %>%
activate(edges) %>%
mutate(weight=corr) %>%
select(-c(Var1_class, Var2_class)) %>%
activate(nodes) %>%
select(-ResFinder_class) %>%
left_join(resClasses, by=c("name"))
networks <- c(networks, setNames(list(G), host))
}
Main figures and tables
Table 1: Grouped labels for host and environments
rbind(
meta.data %>% group_by(hostGroup) %>% summarise(`Number of samples`=n()),
meta.data %>% summarise(`Number of samples`=n()) %>% mutate(hostGroup = 'all')
) %>% left_join(
rbind(
all.data %>% group_by(hostGroup) %>% summarise(`Number of samples with ARGs`=n_distinct(run_accession)),
all.data %>% summarise(`Number of samples with ARGs`=n_distinct(run_accession)) %>% mutate(hostGroup = 'all')
),
by='hostGroup'
) %>%
mutate(
hostGroup = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
TRUE ~ hostGroup
)
))
## # A tibble: 12 × 3
## hostGroup `Number of samples` `Number of samples with ARGs`
## <chr> <int> <int>
## 1 Air 914 870
## 2 Dog 3439 3182
## 3 Chicken 1223 1215
## 4 Cow 886 824
## 5 Freshwater 4494 585
## 6 Human 95003 57239
## 7 Marine 30002 5444
## 8 Mouse 3947 3909
## 9 Pig 3229 3171
## 10 Soil 6533 2822
## 11 <NA> 64352 39945
## 12 All 214022 119206
Table 2: Relative abundance of read fragments
get_host_order <- function(x){return(which(x == hosts))}
sum.gene.counts <- gene.counts %>%
filter(!is.na(hostGroup)) %>%
left_join(resClasses, by=c("refSequence"="name")) %>%
group_by(hostGroup, ResFinder_class) %>%
summarise(fragmentCountAln=sum(fragmentCountAln), nGeneClass = n_distinct(refSequence)) #%>%
#pivot_wider(id_cols=hostGroup, names_from=ResFinder_class, values_from=fragmentCountAln)
sum.gene.counts2 <- sum.gene.counts %>%
group_by(hostGroup) %>%
summarise(sumVar=sum(fragmentCountAln), nGene=sum(nGeneClass)) %>%
left_join(sum.gene.counts, by=c("hostGroup")) %>%
mutate(
fragClosed = (100 / sumVar) * fragmentCountAln,
hostOrder = mapply(get_host_order, hostGroup),
hostGroup2 = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
TRUE ~ hostGroup))
)
resClasses %>%
filter(name %in% unique(count.data$refSequence)) %>%
group_by(ResFinder_class) %>%
summarise(n=n()) %>%
mutate(label=paste0(ResFinder_class, ' (', n, ')')) %>%
right_join(sum.gene.counts2, by=c('ResFinder_class' = 'ResFinder_class')) %>%
pivot_wider(id_cols='label', names_from='hostGroup2', values_from='fragClosed') %>%
write_tsv(file='figs/table_class_rel_abundances.tsv')
Figure 1: Overview of networks
For Figure 1a: Each network is visualized by using the
Fruchterman-Reingold layout and save as a pdf/png/tiff.
source.network.plots <- list()
all.network <- NULL
for (host in names(networks)) {
G <- networks[[host]]
if(host == 'homo_sapiens') {
host = 'human'
} else if (host == 'canis_lupus') {
host = 'dog'
}
plottitle <- str_to_title(str_replace(host, '_', ' '))
# define layout
e <- get.edgelist(G, names=F)
layout <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(G),
area = 6*(vcount(G)^2), repulse.rad = vcount(G)^2.5)
# make visualizations with and without legends
graph.plot.leg <- ggraph(G, layout="manual", x=layout[,1], y=layout[,2]) +
geom_edge_link0(aes(color=corr, width=corr, alpha=corr)) +
geom_node_point(aes(color=ResFinder_class)) +
scale_edge_color_viridis("Correlation",direction=-1, limits=c(0.6,1)) +
scale_edge_width(range=c(.2, 1), guide="none") +
scale_edge_alpha(range=c(0.5, 0.9), guide="none") +
scale_color_manual("ResFinder class", values=classes_colors) +
# scale_shape_manual("Classified\nas CIA", values=c('0' = 15, '1'=17)) +
ggtitle(plottitle) +
theme_graph(base_family="sans")
ggsave(plot=graph.plot.leg, filename=paste0('figs/network_', host, '.png'), width=15, height=10)
graph.plot.noleg <- ggraph(G, layout="manual", x=layout[,1], y=layout[,2]) +
geom_edge_link0(aes(color=corr, width=corr, alpha=corr)) +
geom_node_point(aes(color=ResFinder_class)) +
scale_edge_color_viridis(direction=-1, guide="none", limits=c(0.6, 1)) +
scale_edge_width(range=c(.1, 1), guide="none") +
scale_edge_alpha(range=c(0.5, 0.9), guide="none") +
scale_color_manual(values=classes_colors, guide="none") +
ggtitle(plottitle) +
theme_graph(title_size = 14, base_family="sans")
ggsave(plot=graph.plot.noleg, filename=paste0('figs/network_', host, '_noleg.png'), width=10, height=10)
graph.plot.noleg.cia <- ggraph(G, layout="manual", x=layout[,1], y=layout[,2]) +
geom_edge_link0(aes(color=corr, width=corr, alpha=corr)) +
geom_node_point(aes(color=ResFinder_class)) +
scale_edge_color_viridis(direction=-1, limits=c(0.6,1), guide="none") +
scale_edge_width(range=c(.2, 1), guide="none") +
scale_edge_alpha(range=c(0.5, 0.9), guide="none") +
scale_color_manual(values=classes_colors, guide="none") +
# scale_shape_manual(values=c('0' = 15, '1'=17), guide="none") +
ggtitle(plottitle) +
theme_graph(base_family="sans")
ggsave(plot=graph.plot.noleg.cia, filename=paste0('figs/network_', host, '_noleg_cia.png'), width=10, height=10)
# with arg names on it
graph.plot.anno <- G %>%
activate(nodes) %>%
mutate(name2 = sub('_[^_]+$', '', name)) %>%
ggraph(layout="manual", x=layout[,1], y=layout[,2]) +
geom_edge_link0(aes(color=corr, width=corr, alpha=corr)) +
geom_node_point(aes(color=ResFinder_class)) +
geom_node_text(aes(label=name2), size=2, repel = T, check_overlap = T) +
scale_edge_color_viridis("Correlation",direction=-1, limits=c(0.6,1)) +
scale_edge_width(range=c(.2, 1), guide="none") +
scale_edge_alpha(range=c(0.7, 0.9), guide="none") +
scale_color_manual("ResFinder class", values=classes_colors) +
ggtitle(plottitle) +
theme_graph(title_size = 14, base_family="sans")
ggsave(plot=graph.plot.anno, filename=paste0('figs/network_', host, '_anno.png'), width=15, height=10)
if(host == 'all') {
all.network <- graph.plot.leg
} else {
source.network.plots <- c(source.network.plots, setNames(list(graph.plot.noleg.cia), host))
}
}
p.sources <- ggarrange(plotlist=source.network.plots[order(names(source.network.plots))], nrow=2, ncol=ceiling(length(source.network.plots)/2), common.legend = T)
p.combined <- ggarrange(all.network, p.sources, ncol=2, common.legend = T, legend='bottom', widths=c(.5, 1), labels='a')
ggsave(plot=p.combined, filename='figs/fig1a_network_combined.png', width=22, height=8)
ggsave(plot=p.combined, filename='figs/fig1a_network_combined.pdf', width=22, height=8)
ggsave(plot=p.combined, filename='figs/fig1a_network_combined.tiff', width=22, height=8)
For Figure 1b: Summarise global properties of the different
networks
network.characteristics <- tibble()
for (host in names(networks)) {
G <- networks[[host]] #importGraph(netFile)
net.characteristics <- summarise.graph(G) %>%
mutate(host=host, sort_order=which(hosts == host))
network.characteristics <- rbind(network.characteristics, net.characteristics)
}
network.characteristics <- network.characteristics %>%
mutate(
host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
))
)
network.characteristics.final <- network.characteristics %>%
mutate(
global_clustering_coefficient = round(global_clustering_coefficient, 3),
edge_density = round(edge_density, 3),
network_density = round(network_density,3)
) %>%
arrange(sort_order) %>%
select(host, number_of_nodes, number_of_edges, global_clustering_coefficient, network_density, edge_density, number_of_components)
(p.netchars <- ggtexttable(
network.characteristics.final,
cols = c(
str_wrap('Network', width=7),
str_wrap('No. nodes', width=7),
str_wrap('No. edges', width=7),
str_wrap('Global clustering coefficient', width=17),
str_wrap('Network density', width=7),
str_wrap('Edge density', width=7),
str_wrap('No. components', width=7)
),
rows=rep('', nrow(network.characteristics.final)),
theme = ttheme(base_style="light", padding=unit(c(2,2), "mm"))
))

For Figure 1c: Distribution of correlation coefficients / weights
all.edges <- tibble()
for (host in names(networks)) {
G <- networks[[host]]
host_order = which(host == hosts)
edge.data <- G %>%
as_data_frame %>%
select(from, to, corr) %>%
mutate(host=host, host_order=host_order)
all.edges <- rbind(all.edges, edge.data)
}
all.edges <- all.edges %>%
mutate(
host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
))
)
(p.edge.dist <- ggplot(all.edges) +
geom_histogram(aes(x=corr), color='white', binwidth = 0.01) +
facet_grid(fct_reorder(host, host_order)~., scales='free_y') +
xlab('Correlation') + ylab('Count') +
#theme_pubr()
theme_light() +
theme(axis.text.y = element_text(size=8))
)

For figure 1d:Overlapping nodes and edges in the networks
overlapping.edges <- matrix(nrow=length(networks), ncol=length(networks))
overlapping.nodes <- matrix(nrow=length(networks), ncol=length(networks))
colnames(overlapping.edges) <- rownames(overlapping.edges) <- names(networks)
colnames(overlapping.nodes) <- rownames(overlapping.nodes) <- names(networks)
for (hostSet in utils::combn(names(networks), m=2, simplify = F)) {
h1 = hostSet[1]
h2 = hostSet[2]
G1 <- networks[[h1]]
G2 <- networks[[h2]]
if (any(is.null(G1), is.null(G2))) {
next
}
overlapping.edges[h1, h2] <- length(intersect(E(G1), E(G2)))
overlapping.nodes[h1, h2] <- length( intersect(V(G1), V(G2)))
}
overlapping.edges <- as_tibble(overlapping.edges,rownames = 'h1') %>%
pivot_longer(-h1, names_to = 'h2')
overlapping.nodes <- as_tibble(overlapping.nodes,rownames = 'h1') %>%
pivot_longer(-h1, names_to = 'h2')
(p.overlaps <- ggplot() +
geom_tile(data=overlapping.nodes, aes(x=h1, y=h2, fill=value)) +
geom_text(data=overlapping.nodes, aes(x=h1, y=h2, label=value), color='white') +
scale_fill_viridis_c(str_wrap("Overlapping nodes", 11), na.value=NA) +
new_scale_fill() +
geom_tile(data=overlapping.edges, aes(y=h1, x=h2, fill=value)) +
geom_text(data=overlapping.edges, aes(y=h1, x=h2, label=value), color='white') +
scale_fill_viridis_c(str_wrap("Overlapping edges", 11), na.value=NA, option='H') +
scale_x_discrete(labels=c('Air', 'All', 'Dog', 'Chicken', 'Cow', 'Freshwater', 'Human', 'Marine', 'Mouse', 'Pig', 'Soil')) +
scale_y_discrete(labels=c('Air', 'All', 'Dog', 'Chicken', 'Cow', 'Freshwater', 'Human', 'Marine', 'Mouse', 'Pig', 'Soil')) +
theme_classic() +
theme(
axis.title = element_blank(),
legend.margin=margin(0,0,0,0)
)
)

combine
p.net_descs <- ggarrange(p.netchars, p.edge.dist, p.overlaps, ncol=3, labels=c('b', 'c', 'd'))
## Warning: Removed 66 rows containing missing values (geom_text).
## Warning: Removed 66 rows containing missing values (geom_text).
ggsave(plot=p.net_descs, filename='figs/fig1bcd_network_characteristics.png', width=20, height=6)
ggsave(plot=p.net_descs, filename='figs/fig1bcd_network_characteristics.pdf', width=20, height=6)
ggsave(plot=p.net_descs, filename='figs/fig1bcd_network_characteristics.tiff', width=20, height=6)
Figure 2: Correlation profiles for glycopeptide and macrolide
resistances
make_symmetric <- function(data, value.var) {
mat <- reshape2::dcast(data, AM1 ~ AM2, value.var=value.var)
rownames(mat) <- mat$AM1
mat <- mat[, -1]
return(mat)
}
pretty_melt <- function(data.upper, data.lower, data.upper.value = 'avgcorr', data.lower.value='n_edges') {
data.upper[lower.tri(data.upper)] <- NA
data.lower[upper.tri(data.lower)] <- NA
data.upper.long <- as_tibble(data.upper, rownames = 'AM1') %>%
pivot_longer(!'AM1', names_to='AM2', values_to=data.upper.value)
data.lower.long <- as_tibble(data.lower, rownames = 'AM1') %>%
pivot_longer(!'AM1', names_to='AM2', values_to=data.lower.value)
return(plyr::rbind.fill(data.upper.long,data.lower.long))
}
averageclasses.all <- tibble()#list(C1 = NA, C2 = NA, host=NA, avgcorr=NA, n_edges=NA))
for (host in names(networks)) {
G <- networks[[host]]
ho <- which(host == hosts)
edge.matrix <- as_adjacency_matrix(G,attr = 'corr', sparse = F, type='upper')
gene2class <- G %>% as_tibble() %>% select(name, ResFinder_class) %>%
separate_rows(ResFinder_class, sep='/')
averagesclasses <- tibble()
for (amcombi in combn(unique(gene2class$ResFinder_class), m=2,simplify = F)){
am.genes <- gene2class[gene2class$ResFinder_class %in% amcombi, ]$name
am.res <- reshape2::melt(edge.matrix[am.genes, am.genes], value.name = 'corr') %>%
filter(corr > 0) %>%
mutate(zscore = DescTools::FisherZ(corr)) %>%
summarise(meanzscore = mean(zscore), n_edges = n()) %>%
mutate(avgcorr = DescTools::FisherZInv(meanzscore), host=host, host_order=ho, AM1 = amcombi[1], AM2=amcombi[2])
averagesclasses <- rbind(averagesclasses, am.res)
}
averageclasses <- rbind(averagesclasses, averagesclasses %>% rename(AM2 = AM1, AM1=AM2))
avgcorr.mat <- make_symmetric(averageclasses,value.var='avgcorr')
n_edges.mat <- make_symmetric(averageclasses,value.var='n_edges')
averagedata <- pretty_melt(avgcorr.mat, n_edges.mat) %>%mutate(host=host, ho=ho)
averageclasses.all <- rbind(averageclasses.all, averagedata)
}
classesAM <- sort(unique(unlist(str_split(unique(resClasses$ResFinder_class), '/'))))
diag_df <- as_tibble(list(host=hosts)) %>% mutate(x=0,xend=length(classesAM)+1,y=0,yend=length(classesAM)+1)
averageclasses.all <- averageclasses.all %>%
filter(!(is.na(avgcorr) & is.na(n_edges))) %>%
mutate(
host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
)
)
)
am_combis <- as_tibble(t(combn(classesAM, m=2, simplify=T)))
coord_data <- cbind(
do.call("rbind", replicate(length(hosts), am_combis, simplify=F)),
as_tibble(list(host=rep(hosts, each=nrow(am_combis)),host_order=rep(1:length(hosts), each=nrow(am_combis))))
) %>%
rename(Target=V1, y=V2)
#
coord_data <- rbind(coord_data, rename(coord_data, Target=y, y=Target))
ams <- list('Glycopeptide', 'Macrolide')
filt.coord.data <- coord_data %>% filter(Target %in% ams) %>%
mutate(host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
)))
dot.data <- rbind(
filter(averageclasses.all, AM1 %in% ams, !is.na(avgcorr)) %>% rename(Target = AM1, y=AM2),
filter(averageclasses.all, AM2 %in% ams, !is.na(avgcorr)) %>% rename(Target = AM2, y=AM1)
) %>%
mutate(
host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
)))
filt.coord.data %>%
left_join(dot.data, by=c("Target", "y", "host")) %>%
ggplot() +
geom_point(aes(x=reorder(host, host_order), y=y), shape=21, size=4, stroke=.1, fill='white') +
geom_point(mapping=aes(x=host, y=y, fill=avgcorr), shape=21, size=4, stroke=.1) +
# geom_point(mapping=aes(x=reorder(host, host_order), y=y), fill='white', shape=21, size=4, stroke=.1) +
scale_fill_viridis("Average\ncorrelation",direction=-1, limits=c(0.6,1), na.value='white') +
facet_wrap(~Target) +
theme_light() +
theme(axis.title=element_blank(), axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))

ggsave(filename=paste0('figs/fig2_sel_cia_ams.png'), width=6*length(ams), height=6)
ggsave(filename=paste0('figs/fig2_sel_cia_ams.pdf'), width=6*length(ams), height=6)
Figure 3: Correation profiles for pleuromutilin and tetracycline
resistances
ams <- list('Tetracycline', 'Pleuromutilin')
filt.coord.data <- coord_data %>% filter(Target %in% ams) %>%
mutate(host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
)))
dot.data <- rbind(
filter(averageclasses.all, AM1 %in% ams, !is.na(avgcorr)) %>% rename(Target = AM1, y=AM2),
filter(averageclasses.all, AM2 %in% ams, !is.na(avgcorr)) %>% rename(Target = AM2, y=AM1)
) %>%
mutate(
host = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host
)))
filt.coord.data %>%
left_join(dot.data, by=c("Target", "y", "host")) %>%
ggplot() +
geom_point(aes(x=reorder(host, host_order), y=y), shape=21, size=4, stroke=.1, fill='white') +
geom_point(mapping=aes(x=host, y=y, fill=avgcorr), shape=21, size=4, stroke=.1) +
# geom_point(mapping=aes(x=reorder(host, host_order), y=y), fill='white', shape=21, size=4, stroke=.1) +
scale_fill_viridis("Average\ncorrelation",direction=-1, limits=c(0.6,1), na.value='white') +
facet_wrap(~Target) +
theme_light() +
theme(axis.title=element_blank(), axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))

ggsave(filename=paste0('figs/fig3_sel_ia_ams.png'), width=6*length(ams), height=6)
ggsave(filename=paste0('figs/fig3_sel_ia_ams.pdf'), width=6*length(ams), height=6)
Supplementary figures
Figure S1: Metagenomic origins
Define colors for the various sampling sources
host_colors_palette <- tibble(host=c(hosts, 'other'), hostOrder=1:12, color=c(pals::tol(n=11), 'grey')) %>%
mutate(
hostGroup = str_to_title(case_when(
host == 'canis_lupus' ~ 'Dog',
host == 'homo_sapiens' ~ 'Human',
TRUE ~ host))
)
host_colors = setNames(host_colors_palette$color, host_colors_palette$hostGroup)
host_col_scale <- scale_colour_manual(name = "Sampling source", values = host_colors, limits = force)
host_fill_scale <- scale_fill_manual(name = "Sampling source", values = host_colors, limits = force)
Create sampling year plot
p.meta.year <- meta.data %>%
mutate(year=replace_na(year, 'Unknown'), hostGroup=replace_na(hostGroup, 'other')) %>%
group_by(hostGroup, year) %>%
summarise(n_runs=n_distinct(run_accession)) %>%
ungroup() %>%
mutate(
hostOrder = replace_na(mapply(get_host_order, hostGroup), length(hosts)+1),
hostGroup = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
TRUE ~ hostGroup))
) %>%
ggplot() +
geom_col(aes(y=year, x=n_runs, fill=reorder(hostGroup, hostOrder))) +
host_fill_scale +
ylim(seq(2000, 2020, 1), 'Unknown') +
xlab('Sample count') +
ylab('Sampling year') +
theme_light()
## `summarise()` has grouped output by 'hostGroup'. You can override using the `.groups` argument.
meta.data %>%
filter(is.na(year)) %>%
dim()
## [1] 84238 13
Create overview of sequencing platforms
p.meta.platforms <- meta.data %>%
mutate(hostGroup = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
is.na(hostGroup) ~ 'Other',
TRUE ~ hostGroup))) %>%
ggplot() +
geom_col(aes(x=raw_reads, y=instrument_platform, fill=hostGroup)) +
host_fill_scale +
facet_wrap(instrument_platform ~ ., scales='free', ncol=1) +
theme_light() +
theme(strip.text = element_blank(), strip.background = element_blank()) +
xlab('Count of raw sequencing reads') +
ylab('Instrument platform')
Create overview of sampling locations
library(maps)
map <- map_data('world')
gps.runs.host <- meta.data %>%
mutate(hostGroup = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
is.na(hostGroup) ~ 'Other',
TRUE ~ hostGroup))) %>%
group_by(latitude, longitude, hostGroup) %>%
summarise(n_runs=n_distinct(run_accession))
gps.runs <- meta.data %>%
group_by(latitude, longitude) %>%
summarise(n_runs=n_distinct(run_accession)) %>%
mutate(hostGroup = 'all')
gps.runs.combi <- rbind(gps.runs, gps.runs.host) %>%
mutate(
hostOrder = mapply(get_host_order, hostGroup),
hostGroup = str_to_title(case_when(
hostGroup == 'canis_lupus' ~ 'Dog',
hostGroup == 'homo_sapiens' ~ 'Human',
TRUE ~ hostGroup))
)
p.meta.map <- ggplot() +
geom_polygon(data=map, aes(x=long, y = lat, group = group), fill="grey", alpha=0.3) +
geom_point(data=gps.runs.host , aes(x=longitude, y=latitude, size=n_runs, color=hostGroup), alpha=.5) +
host_col_scale +
scale_size('Number of samples', range=c(.5, 5)) +
theme_void() +
theme(strip.text.x = element_text(size=14)) +
guides(color=F)
rbind(
meta.data %>%
filter(is.na(latitude), is.na(longitude))%>%
summarise(n=n_distinct(run_accession)) %>%
mutate(hostGroup='all'),
meta.data %>%
group_by(hostGroup) %>%
filter(is.na(latitude), is.na(longitude))%>%
summarise(n=n_distinct(run_accession))
)
## # A tibble: 12 × 2
## n hostGroup
## <int> <chr>
## 1 83361 all
## 2 16 air
## 3 3159 canis_lupus
## 4 570 chicken
## 5 262 cow
## 6 61 freshwater
## 7 40003 homo_sapiens
## 8 363 marine
## 9 2842 mouse
## 10 320 pig
## 11 477 soil
## 12 35288 <NA>
Combine into one figure
ggarrange(
ggarrange(p.meta.year, p.meta.platforms, common.legend = T, legend = 'right', labels=c('a.', 'b.')),
p.meta.map, common.legend=F, ncol=1, labels=c('', 'c.'),heights = c(.75, 1))
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning: Removed 15 rows containing missing values (position_stack).
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning: Removed 15 rows containing missing values (position_stack).
## Warning: Removed 21 rows containing missing values (geom_point).

ggsave(filename='figs/figS1_run_metadata.png', width=14, height=10)
ggsave(filename='figs/figS1_run_metadata.pdf', width=14, height=10)
ggsave(filename='figs/figS1_run_metadata.tiff', width=14, height=10)
Figure S4: The relative abundance, number of ARGs, and number of
correlations for each resistance class
corrsPerClass <- tibble()
for (host in names(networks)) {
G <- networks[[host]]
if(host != 'all') {
c.data <- all.data[all.data$hostGroup == host,]
} else {
c.data = all.data
}
sum.arg <- sum(c.data$fragmentCountAln, na.rm=T)
ho = which(host == hosts)
for (resClass in unique(resClasses$ResFinder_class)) {
genes <- resClasses[resClasses$ResFinder_class == resClass, ]$name
m <- igraph::as_adjacency_matrix(G, sparse=F,type = 'upper')
corrsPerClass <- rbind(
corrsPerClass,
tibble(n=sum(m[colnames(m)[colnames(m) %in% genes],])+sum(m[,rownames(m)[rownames(m) %in% genes]]), host=host, resClass=resClass, host_order=ho)
)
}
}
corrsPerClass[corrsPerClass$n == 0, 'n'] <- NA
facet_labels = c(`n` = 'Number of correlations',
`fragClosed` = 'Relative abundance (%)',
`air` = 'Air',
`all` = 'All',
`chicken` = 'Chicken',
`cow` = 'Cow',
`canis_lupus` = 'Dog',
`marine` = 'Marine',
`homo_sapiens` = 'Human',
`mouse` = 'Mouse',
`freshwater` = 'Freshwater',
`soil` = 'Soil',
`pig` = 'Pig',
`nGeneClass` = 'Number of ARGs in class'
)
rbind(
select(sum.gene.counts2,hostOrder, hostGroup, ResFinder_class, fragClosed, nGeneClass) %>%
pivot_longer(!c(hostOrder, hostGroup, ResFinder_class)) %>%
rename(host=hostGroup, host_order=hostOrder, resClass=ResFinder_class) %>%
mutate(name_order=case_when(
name == 'fragClosed' ~ 1,
name == 'nGeneClass' ~ 2,
)),
corrsPerClass %>%
pivot_longer(!c(host, host_order, resClass)) %>%mutate(name_order=3)
) %>%
ggplot(aes(x=value, y=resClass, fill=name)) +
geom_bar(stat='identity') +
scale_fill_manual(guide="none", values=list("n" = "#E69F00", "fragClosed"="#009E73", "nGeneClass" = '#999999')) +
facet_wrap(fct_reorder(host, host_order)~fct_reorder(name, name_order), scales='free_x',ncol = 6, labeller=as_labeller(facet_labels)) +
xlab('') + ylab('') +
theme_light()
## Warning: Removed 162 rows containing missing values (position_stack).

ggsave('figs/figS4_classes_rel_corrs.png', width=30, height=40)
## Warning: Removed 162 rows containing missing values (position_stack).
ggsave('figs/figS4_classes_rel_corrs.pdf', width=30, height=40)
## Warning: Removed 162 rows containing missing values (position_stack).
ggsave('figs/figS4_classes_rel_corrs.tiff', width=30, height=40)
## Warning: Removed 162 rows containing missing values (position_stack).
Figure S5: Networks illustrating how one ARG interacts differently
depending on the sampling environments
gene.regexes <- c("catA1", "mef\\(A\\)_1", "tet\\(L\\)_4")
#all.gene.graphplots <- list()
gene.graphplots <- list()
for (gene.regex in gene.regexes) {
add_legend = length(gene.regexes) == which(gene.regex == gene.regexes)
for (host in hosts) {
G <- networks[[host]]
G.sel <- extract_neighborhood(gene.regex, G)
if (is.null(G.sel)) {
p.h <- ggplot() + ggtitle(str_to_title(host)) + theme_void() + theme_graph(base_family="sans") + annotate("text",label="No correlations", x=0, y=0)
} else {
p.h <- G.sel %>%
activate(nodes) %>%
left_join(filter(gene.rels, hostGroup == host), by=c("name" = "refSequence")) %>%
mutate(name=sub('_[^_]+$', '', name)) %>%
ggraph(layout="nicely") +
geom_edge_link0(aes(color=corr, width=corr, alpha=corr)) +
geom_node_point(aes(color=ResFinder_class, size=fragClo, shape=sel)) +
geom_node_label(aes(label=name), size=2, repel = T) +
scale_edge_width(range=c(.2, 1), guide="none") +
scale_edge_alpha(range=c(0.5, 0.9), guide="none") +
scale_edge_color_viridis("Correlation",direction=-1,limits=c(0.6, 1)) +
scale_size_continuous('Relative abundance', range=c(1, 4), breaks=c(5, 10, 25, 50, 75, 100)) +
scale_color_manual("ResFinder class", values=classes_colors) +
scale_shape_manual("Highlighted", values=list("Yes" = 16, "No" = 15))+
ggtitle(str_to_title(host)) +
theme_graph(base_family="sans")
if(!add_legend){
p.h <- p.h + guides(colour="none", shape="none", size="none", edge_color="none")
}
# if (add_legend) {
# p.h <- p.h +
# scale_color_manual("ResFinder class", values=classes_colors) +
# scale_edge_color_viridis("Correlation",direction=-1,limits=c(0.6, 1)) +
# scale_size('Relative abundance', range=c(1, 4)) +
# scale_shape_manual("Highlighted", values=list("Yes" = 16, "No" = 15))
#
# } else {
# p.h <- p.h +
# scale_color_manual(guide="none", values=classes_colors) +
# scale_edge_color_viridis(guide="none",direction=-1,limits=c(0.6, 1)) +
# scale_shape_manual(guide="none", values=list("Yes" = 16, "No" = 15)) +
# scale_size(guide="none", range=c(1, 4))
# }
}
gene.graphplots <- c(gene.graphplots, list(p.h))
}
#print(paste(gene.regex, length(gene.graphplots)))
#p.g <- ggarrange(plotlist=gene.graphplots, nrow=1, common.legend = T, legend="bottom")
#all.gene.graphplots <- c(all.gene.graphplots, list(p.g))
}
p.gp.all <- ggarrange(
plotlist=gene.graphplots,
ncol=11, nrow=length(gene.regexes),
common.legend = T,
labels = c("a. catA1_1", rep("", 10), "b. mef(A)_1", rep("", 10), "c. tet(L)_4", rep("", 10)),
font.label = list(size = 18, color = "black", face = "bold", family = NULL)
)
ggsave(plot=p.gp.all, filename = "figs/figS5_network_sel.png", width=30,height=15)
ggsave(plot=p.gp.all, filename = "figs/figS5_network_sel.eps", width=30,height=15, family="sans")
ggsave(plot=p.gp.all, filename = "figs/figS5_network_sel.pdf", width=30,height=15)
Figure S7: Highlights of interactions between ARGs of glycopeptide
and macrolide resistances (CIA)
net.titles.host <- c("mouse" = "Mouse", "homo_sapiens" = "Human", "freshwater" = "Freshwater", "pig" = "Pig", "cow" = "Cow", "chicken" = "Chicken")
sel.nets.classes <- list(
"glycopeptide_highlights" = c("homo_sapiens" = "Glycopeptide", "pig"= "Glycopeptide"),
"macrolide_highlights" = c("homo_sapiens" = "Macrolide", "pig" = "Macrolide", "cow" = "Macrolide", "chicken" = "Macrolide")
)
all.sets <- list()
for (fname in names(sel.nets.classes)) {
nets.highlights <- list()
node_col_scale = scale_color_manual(guide="none", values=classes_colors)
for(host in sort(names(sel.nets.classes[[fname]]))){
am = sel.nets.classes[[fname]][[host]]
G <- networks[[host]]
# define layout
e <- get.edgelist(G, names=F)
layout <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(G),
area = 6*(vcount(G)^2), repulse.rad = vcount(G)^2.5)
node.sel <- as_tibble(G) %>% rownames_to_column() %>% filter(str_detect(ResFinder_class, am))
node.ids <- as.numeric(node.sel$rowname)
G2 <- G %>%
activate(nodes) %>%
mutate(x=layout[,1], y=layout[,2])
neighborhoods <- c()
for (node.id in node.ids) {
neighborhood <- G %>%
convert(to_local_neighborhood, node=node.id, order=1, mode="all") %>%
activate(edges) %>%
mutate(highlight = T) %>%
activate(nodes) %>%
mutate(highlight = T)
G2 <- G2 %>% bind_edges(as_data_frame(neighborhood, what="edges"))
neighborhoods <- c(neighborhoods, V(neighborhood)$name)
}
G2 %>%
activate(nodes) %>%
mutate(
focus_node = name %in% node.sel$name
)
p.neighborhoods <- G2 %>%
activate(nodes) %>%
mutate(
highlight = name %in% neighborhoods,
gene = sub('_[^_]+$', '', name),
focus_node = case_when(
name %in% node.sel$name ~1,
T ~ 0
)
) %>%
ggraph(layout="manual", x=x, y=y) +
geom_edge_link(aes(filter=is.na(highlight)), color="gray", width=.5, alpha=.25) +
geom_edge_link(aes(filter=!is.na(highlight), color=corr), width=.5, alpha=.5) +
geom_node_point(aes(color=ResFinder_class, filter=highlight == T, size=focus_node)) +
geom_node_text(aes(label=gene, filter=highlight == T, color=ResFinder_class), repel=T, size=4) +
scale_size(breaks=1, labels=c(paste('Confers resistance\nto', am)), range=c(1,3),name = '') +
# geom_node_label(aes(label=name), size=2, repel = T)
# scale_edge_color_viridis("Correlation",direction=-1, limits=c(0.6, 1)) +
scale_edge_color_viridis(guide="none",direction=-1, limits=c(0.6, 1)) +
node_col_scale +
ggtitle(net.titles.host[[host]]) +
theme_graph(base_family="sans") +
theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, size = 14))
nets.highlights <- c(nets.highlights, list(p.neighborhoods))
}
p.sel<-ggarrange(plotlist=nets.highlights, ncol=2,nrow=length(sel.nets.classes[[fname]])/2, common.legend = T, legend="right")
ggsave(filename=paste0('figs/',fname, '.png'), height=7*length(sel.nets.classes[[fname]])/2, width=5*length(sel.nets.classes[[fname]]))
all.sets <- c(all.sets, list(p.sel))
}
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p.cia <- ggarrange(plotlist=all.sets, ncol=1, labels = c('a. Glycopeptide', 'b. Macrolide'), heights = c(1, 2), font.label = list(size=16))
ggsave(plot = p.cia, filename='figs/figS7_cia_highlights.png', width=14, height=21)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(plot = p.cia, filename='figs/figS7_cia_highlights.pdf', width=14, height=21)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(plot = p.cia, filename='figs/figS7_cia_highlights.tiff', width=14, height=21)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Figure S8: Highlights of interactions between ARGs of pleuromutilin
and tetracycline resistances (HIA/IA)
asel.nets.classes <- list(
"pleuromutiline_highlights" =c("homo_sapiens" = "Pleuromutilin", "pig" = "Pleuromutilin", "cow" = "Pleuromutilin", "chicken" = "Pleuromutilin"),
"tetracycline_highlights" = c("homo_sapiens" = "Tetracycline", "pig" = "Tetracycline", "cow" = "Tetracycline", "chicken" = "Tetracycline")
)
all.sets <- list()
for (fname in names(sel.nets.classes)) {
nets.highlights <- list()
node_col_scale = scale_color_manual(guide="none", values=classes_colors)
for(host in sort(names(sel.nets.classes[[fname]]))){
am = sel.nets.classes[[fname]][[host]]
G <- networks[[host]]
# define layout
e <- get.edgelist(G, names=F)
layout <- qgraph.layout.fruchtermanreingold(e, vcount = vcount(G),
area = 6*(vcount(G)^2), repulse.rad = vcount(G)^2.5)
node.sel <- as_tibble(G) %>% rownames_to_column() %>% filter(str_detect(ResFinder_class, am))
node.ids <- as.numeric(node.sel$rowname)
G2 <- G %>%
activate(nodes) %>%
mutate(x=layout[,1], y=layout[,2])
neighborhoods <- c()
for (node.id in node.ids) {
neighborhood <- G %>%
convert(to_local_neighborhood, node=node.id, order=1, mode="all") %>%
activate(edges) %>%
mutate(highlight = T) %>%
activate(nodes) %>%
mutate(highlight = T)
G2 <- G2 %>% bind_edges(as_data_frame(neighborhood, what="edges"))
neighborhoods <- c(neighborhoods, V(neighborhood)$name)
}
G2 %>%
activate(nodes) %>%
mutate(
focus_node = name %in% node.sel$name
)
p.neighborhoods <- G2 %>%
activate(nodes) %>%
mutate(
highlight = name %in% neighborhoods,
gene = sub('_[^_]+$', '', name),
focus_node = case_when(
name %in% node.sel$name ~1,
T ~ 0
)
) %>%
ggraph(layout="manual", x=x, y=y) +
geom_edge_link(aes(filter=is.na(highlight)), color="gray", width=.5, alpha=.25) +
geom_edge_link(aes(filter=!is.na(highlight), color=corr), width=.5, alpha=.5) +
geom_node_point(aes(color=ResFinder_class, filter=highlight == T, size=focus_node)) +
geom_node_text(aes(label=gene, filter=highlight == T, color=ResFinder_class), repel=T, size=4) +
scale_size(breaks=1, labels=c(paste('Confers resistance\nto', am)), range=c(1,3),name = '') +
# geom_node_label(aes(label=name), size=2, repel = T)
# scale_edge_color_viridis("Correlation",direction=-1, limits=c(0.6, 1)) +
scale_edge_color_viridis(guide="none",direction=-1, limits=c(0.6, 1)) +
node_col_scale +
ggtitle(net.titles.host[[host]]) +
theme_graph(base_family="sans") +
theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5, size = 14))
nets.highlights <- c(nets.highlights, list(p.neighborhoods))
}
p.sel<-ggarrange(plotlist=nets.highlights, ncol=2,nrow=length(sel.nets.classes[[fname]])/2, common.legend = T, legend="right")
ggsave(filename=paste0('figs/',fname, '.png'), height=7*length(sel.nets.classes[[fname]])/2, width=5*length(sel.nets.classes[[fname]]))
all.sets <- c(all.sets, list(p.sel))
}
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p.hia <- ggarrange(plotlist=all.sets, ncol=1, labels = c('a. Pleuromutilin', 'b. Tetracycline'),font.label = list(size=16))
ggsave(plot = p.hia,filename='figs/figS8_ia_highlights.png', width=14, height=21)
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(plot = p.hia,filename='figs/figS8_ia_highlights.pdf', width=14, height=21)
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(plot = p.hia,filename='figs/figS8_ia_highlights.tiff', width=14, height=21)
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps